#Importing packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import holidays
import gc
import dill
from datetime import date
from pandas.tseries.holiday import USFederalHolidayCalendar as calendar
#Setting large figure size for Seaborn
sns.set(rc={'figure.figsize':(11.7,8.27),"font.size":20,"axes.titlesize":20,"axes.labelsize":18})
from sklearn.base import clone
import numpy.random as rng
from sklearn.utils import check_random_state
from sklearn.decomposition import PCA
from functions import *
from sklearn.model_selection import KFold, ParameterGrid
from sklearn.cluster import KMeans, DBSCAN
from sklearn.metrics import silhouette_score, adjusted_rand_score
import plotly.express as px
import plotly.graph_objects as go
#Importing MaxNLocator to restrict matplotlib scale to integers
from matplotlib.ticker import MaxNLocator
data_dir = 'data'
def load_oracle_files(dir: str):
    """Load the Orders and Customers Excel workbooks found under *dir*.

    Returns the (orders, customers) pair of dataframes.
    """
    orders_frame = pd.read_excel(f"{dir}/Orders.xlsx")
    customers_frame = pd.read_excel(f"{dir}/Customers.xlsx")
    return orders_frame, customers_frame
orders, customers = load_oracle_files(data_dir)
print(orders.shape)
orders.head()
(131706, 8)
| id | Date | Customer_ID | Transaction_ID | SKU_Category | SKU | Quantity | Sales_Amount | |
|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 2021-01-02 | 2547 | 1 | X52 | 0EM7L | 1.0 | 3.13 |
| 1 | 2 | 2021-01-02 | 822 | 2 | 2ML | 68BRQ | 1.0 | 5.46 |
| 2 | 3 | 2021-01-02 | 3686 | 3 | 0H2 | CZUZX | 1.0 | 6.35 |
| 3 | 4 | 2021-01-02 | 3719 | 4 | 0H2 | 549KK | 1.0 | 5.59 |
| 4 | 5 | 2021-01-02 | 9200 | 5 | 0H2 | K8EHH | 1.0 | 6.88 |
print(customers.shape)
customers.head()
(22625, 4)
| Customer_ID | GENDER | AGE | GEOGRAPHY | |
|---|---|---|---|---|
| 0 | 2547 | F | 48 | Spain |
| 1 | 822 | F | 39 | Germany |
| 2 | 3686 | F | 56 | Greece |
| 3 | 3719 | F | 49 | Spain |
| 4 | 9200 | M | 18 | Italy |
orders.describe()
| id | Customer_ID | Transaction_ID | Quantity | Sales_Amount | |
|---|---|---|---|---|---|
| count | 131706.000000 | 131706.000000 | 131706.000000 | 131706.000000 | 131706.000000 |
| mean | 65853.500000 | 12386.450367 | 32389.604187 | 1.485318 | 11.981524 |
| std | 38020.391614 | 6086.447552 | 18709.901238 | 3.872666 | 19.359699 |
| min | 1.000000 | 1.000000 | 1.000000 | 0.010000 | 0.020000 |
| 25% | 32927.250000 | 7349.000000 | 16134.000000 | 1.000000 | 4.230000 |
| 50% | 65853.500000 | 13496.000000 | 32620.000000 | 1.000000 | 6.920000 |
| 75% | 98779.750000 | 17306.000000 | 48548.000000 | 1.000000 | 12.330000 |
| max | 131706.000000 | 22625.000000 | 64682.000000 | 400.000000 | 707.730000 |
orders[orders.Quantity == 0]
| id | Date | Customer_ID | Transaction_ID | SKU_Category | SKU | Quantity | Sales_Amount |
|---|
orders[orders.Quantity > 100].head(2)
| id | Date | Customer_ID | Transaction_ID | SKU_Category | SKU | Quantity | Sales_Amount | |
|---|---|---|---|---|---|---|---|---|
| 17084 | 17085 | 2021-02-19 | 6869 | 8186 | LGI | VWLV9 | 176.0 | 74.8 |
| 64680 | 64681 | 2021-06-29 | 17025 | 32069 | LGI | VWLV9 | 176.0 | 74.8 |
customers.GEOGRAPHY.value_counts()
Germany 7970 Italy 4480 France 2644 Greece 2269 UK 2264 Spain 1651 Netherlands 1347 Name: GEOGRAPHY, dtype: int64
customers.GENDER.value_counts()
F 11328 M 11297 Name: GENDER, dtype: int64
def clean_data(df: pd.DataFrame, quantity_col: str = 'Quantity'):
    """Drop rows with fractional quantities, cast the column to int and index by 'id'.

    The explicit .copy() after the row filter avoids pandas'
    SettingWithCopyWarning that the original chained assignment triggered,
    and astype(int) replaces the per-element apply.
    """
    df = df.loc[df[quantity_col] % 1 == 0].copy()
    df[quantity_col] = df[quantity_col].astype(int)
    return df.set_index('id')
orders = clean_data(orders)
orders.head(2)
/tmp/ipykernel_916/3184141252.py:3: SettingWithCopyWarning: A value is trying to be set on a copy of a slice from a DataFrame. Try using .loc[row_indexer,col_indexer] = value instead See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy df[quantity_col] = df[quantity_col].apply(lambda x: int(x))
| Date | Customer_ID | Transaction_ID | SKU_Category | SKU | Quantity | Sales_Amount | |
|---|---|---|---|---|---|---|---|
| id | |||||||
| 1 | 2021-01-02 | 2547 | 1 | X52 | 0EM7L | 1 | 3.13 |
| 2 | 2021-01-02 | 822 | 2 | 2ML | 68BRQ | 1 | 5.46 |
# Per-customer item summary: first/last purchase dates, number of distinct
# SKUs bought, and the modal category / SKU (value_counts sorts by frequency,
# so index[0] is the most frequent value).
item_df = orders.groupby("Customer_ID").agg(first_date=('Date','min'), last_date=('Date','max'), diff_items_bought = ('SKU', 'nunique'),
                                            most_cat = ('SKU_Category', lambda x: x.value_counts().index[0]), most_item_SKU=('SKU', lambda x: x.value_counts().index[0]))
item_df.head(2)
| first_date | last_date | diff_items_bought | most_cat | most_item_SKU | |
|---|---|---|---|---|---|
| Customer_ID | |||||
| 1 | 2021-01-22 | 2021-01-22 | 2 | 0H2 | 6OUVC |
| 2 | 2021-03-24 | 2021-06-19 | 2 | TVL | 2SLS0 |
item_df.most_cat.value_counts(normalize=True)[0:10].sum()
# Too many categories to efficiently use this data
0.4176481005932879
# Statistics computed for every numeric feature during per-customer aggregation
standard_functions = ['min', 'max', 'sum', 'mean', 'std', 'median']
# Aggregation plan applied per customer: transaction count plus summary
# statistics of order value, item variety, dates and item totals
aggregations = {
    'Customer_ID': ['count'],
    'order_value' : standard_functions,
    'diff_items' : standard_functions,
    'date': ['min', 'max'],
    'total_items': standard_functions
}
def aggregate_transactions(df: pd.DataFrame, transact_col: str):
    """Collapse individual order lines into one summary row per transaction.

    Each transaction keeps its customer, the count of distinct SKUs, the
    total quantity, the basket value and the (latest) order date.
    """
    spec = dict(
        Customer_ID=('Customer_ID', 'max'),
        diff_items=('SKU', 'nunique'),
        total_items=('Quantity', 'sum'),
        order_value=('Sales_Amount', 'sum'),
        date=('Date', 'max'),
    )
    per_transaction = df.groupby(transact_col).agg(**spec)
    # Discard transactions with incomplete data
    return per_transaction.dropna()
def aggregate_results(df: pd.DataFrame, index_col: str, aggregations: dict):
    """Aggregate *df* per *index_col* and flatten the resulting columns.

    The (column, statistic) MultiIndex is collapsed into 'column_statistic'
    names; the transaction count is renamed to 'number_orders'.
    """
    grouped = df.groupby(index_col).agg(aggregations)
    # Flatten the two-level column index into single underscore-joined names
    grouped.columns = pd.Index(['{}_{}'.format(col, stat) for col, stat in grouped.columns])
    grouped = grouped.rename(columns={'Customer_ID_count': 'number_orders'})
    # Customers with a single order have a NaN standard deviation: use 0
    return grouped.fillna(0)
def compile_customer_info(cust_df: pd.DataFrame, cust_transact: pd.DataFrame, item_df):
    """Join demographics, per-customer transaction stats and item stats into one frame.

    cust_df: raw customers table (must contain a 'Customer_ID' column).
    cust_transact: output of aggregate_results, indexed by Customer_ID.
    item_df: per-customer item summary; only 'diff_items_bought' is merged in.
    Returns the combined frame indexed by Customer_ID, with an 'age_bin'
    categorical and friendlier names for the revenue/items totals.
    """
    cust_df = cust_df.set_index('Customer_ID')
    # Merge our customers data with the aggregated customer transactions data
    results = pd.merge(cust_df, cust_transact, left_index=True, right_index=True, how='inner')
    results = pd.merge(results, item_df[['diff_items_bought']], left_index=True, right_index=True, how='left')
    # Dropping the sum of different items, which is meaningless (a double aggregation)
    results.drop(columns='diff_items_sum', inplace=True)
    # Bin the age column; right=False makes bins left-inclusive, e.g. [18, 25)
    results["age_bin"] = pd.cut(results["AGE"], bins = [18 , 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 80], right = False,
                                labels=["18-24", "25-29", "30-34", "35-39", "40-44", "45-49", "50-54", "55-59", "60-64", "65-69", "70+"])
    results.rename(columns={'order_value_sum': 'total_revenue', 'total_items_sum': 'number_items_bought'}, inplace=True)
    return results
def is_holiday(calendar, date) -> bool:
    """Return True when *date* is contained in the given holiday calendar."""
    return True if date in calendar else False
def define_holidays(df: pd.DataFrame, country_col: str = "GEOGRAPHY") -> pd.DataFrame:
    """Flag each row whose 'date' falls on a public holiday in the row's country.

    Fixes vs original: the return annotation said None although the frame is
    returned, and the lambda was applied to *every* row of the frame once per
    country — now only the matching country's rows are evaluated.
    """
    for country in df[country_col].unique():
        country_calendar = holidays.country_holidays(country)
        mask = df[country_col] == country
        df.loc[mask, 'is_holiday'] = df.loc[mask, 'date'].apply(
            lambda d: is_holiday(country_calendar, d))
    return df
def compile_transaction_info(trans_df, customers_df):
    """Attach customer demographics to each transaction and add time features.

    Bug fix: the original merged with right_index=True against the customers
    table's default RangeIndex, so a transaction of customer N received the
    demographics of the customer stored at row *position* N, not the customer
    whose Customer_ID is N. We key the demographics table by Customer_ID first.
    """
    cust = customers_df
    if 'Customer_ID' in cust.columns:
        # Key demographics by the actual customer identifier
        cust = cust.set_index('Customer_ID')
    results = pd.merge(trans_df, cust, left_on='Customer_ID', right_index=True, how='left')
    # Adding month field and is_holiday boolean field
    results["month"] = results["date"].dt.month
    results["is_holiday"] = False
    # Dropping transactions with no customer records
    results.dropna(inplace=True)
    results = define_holidays(results)
    return results
transactions = aggregate_transactions(orders, transact_col='Transaction_ID')
customer_transact = aggregate_results(transactions, index_col='Customer_ID', aggregations=aggregations)
customer_transact.head(2)
| number_orders | order_value_min | order_value_max | order_value_sum | order_value_mean | order_value_std | order_value_median | diff_items_min | diff_items_max | diff_items_sum | ... | diff_items_std | diff_items_median | date_min | date_max | total_items_min | total_items_max | total_items_sum | total_items_mean | total_items_std | total_items_median | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Customer_ID | |||||||||||||||||||||
| 1 | 1 | 16.29 | 16.29 | 16.29 | 16.290 | 0.000000 | 16.290 | 2 | 2 | 2 | ... | 0.0 | 2.0 | 2021-01-22 | 2021-01-22 | 2 | 2 | 2 | 2.0 | 0.0 | 2.0 |
| 2 | 2 | 7.77 | 15.00 | 22.77 | 11.385 | 5.112382 | 11.385 | 1 | 1 | 2 | ... | 0.0 | 1.0 | 2021-03-24 | 2021-06-19 | 1 | 1 | 2 | 1.0 | 0.0 | 1.0 |
2 rows × 21 columns
customers_df = compile_customer_info(customers, customer_transact, item_df)
customers_df.head(2)
| GENDER | AGE | GEOGRAPHY | number_orders | order_value_min | order_value_max | total_revenue | order_value_mean | order_value_std | order_value_median | ... | date_min | date_max | total_items_min | total_items_max | number_items_bought | total_items_mean | total_items_std | total_items_median | diff_items_bought | age_bin | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Customer_ID | |||||||||||||||||||||
| 2547 | F | 48 | Spain | 7 | 3.13 | 39.62 | 121.36 | 17.337143 | 11.520052 | 13.48 | ... | 2021-01-02 | 2021-04-16 | 1 | 6 | 22 | 3.142857 | 1.951800 | 3.0 | 14 | 45-49 |
| 822 | F | 39 | Germany | 3 | 5.46 | 8.59 | 20.40 | 6.800000 | 1.612793 | 6.35 | ... | 2021-01-02 | 2021-02-03 | 1 | 3 | 5 | 1.666667 | 1.154701 | 1.0 | 5 | 35-39 |
2 rows × 25 columns
transactions = compile_transaction_info(transactions, customers)
transactions.head(2)
| Customer_ID | diff_items | total_items | order_value | date | GENDER | AGE | GEOGRAPHY | month | is_holiday | |
|---|---|---|---|---|---|---|---|---|---|---|
| Transaction_ID | ||||||||||
| 1 | 2547 | 1 | 1 | 3.13 | 2021-01-02 | F | 49.0 | Spain | 1 | False |
| 2 | 822 | 1 | 1 | 5.46 | 2021-01-02 | M | 26.0 | Italy | 1 | False |
customers_df.describe()
| AGE | number_orders | order_value_min | order_value_max | total_revenue | order_value_mean | order_value_std | order_value_median | diff_items_min | diff_items_max | diff_items_mean | diff_items_std | diff_items_median | total_items_min | total_items_max | number_items_bought | total_items_mean | total_items_std | total_items_median | diff_items_bought | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 22586.000000 | 22586.000000 | 22586.000000 | 22586.000000 | 22586.000000 | 22586.000000 | 22586.000000 | 22586.000000 | 22586.000000 | 22586.000000 | 22586.000000 | 22586.000000 | 22586.000000 | 22586.000000 | 22586.000000 | 22586.000000 | 22586.000000 | 22586.000000 | 22586.000000 | 22586.000000 |
| mean | 37.917692 | 2.856017 | 14.252760 | 34.496467 | 69.591381 | 21.716699 | 9.275838 | 19.837490 | 1.520411 | 2.563579 | 1.912922 | 0.497635 | 1.817719 | 1.976667 | 4.168644 | 8.652395 | 2.747276 | 1.014708 | 2.538387 | 4.710263 |
| std | 12.228208 | 3.995979 | 24.107055 | 50.808639 | 152.002250 | 28.255392 | 21.119069 | 27.511457 | 1.088696 | 2.018458 | 1.196527 | 0.868402 | 1.205846 | 5.219676 | 9.142976 | 20.986562 | 5.727765 | 3.500516 | 5.679885 | 6.155573 |
| min | 18.000000 | 1.000000 | 0.030000 | 0.140000 | 0.140000 | 0.140000 | 0.000000 | 0.140000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 1.000000 | 1.000000 |
| 25% | 29.000000 | 1.000000 | 4.290000 | 8.870000 | 10.132500 | 7.726875 | 0.000000 | 7.150000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 1.000000 | 1.000000 | 1.000000 | 2.000000 | 1.000000 | 0.000000 | 1.000000 | 1.000000 |
| 50% | 38.000000 | 1.000000 | 7.500000 | 17.410000 | 23.800000 | 13.425000 | 0.000000 | 12.100000 | 1.000000 | 2.000000 | 1.666667 | 0.000000 | 1.500000 | 1.000000 | 2.000000 | 3.000000 | 2.000000 | 0.000000 | 2.000000 | 3.000000 |
| 75% | 46.000000 | 3.000000 | 14.750000 | 37.087500 | 63.007500 | 24.467536 | 9.495995 | 21.370000 | 2.000000 | 3.000000 | 2.133333 | 0.707107 | 2.000000 | 2.000000 | 4.000000 | 8.000000 | 3.000000 | 1.000000 | 3.000000 | 6.000000 |
| max | 75.000000 | 99.000000 | 544.790000 | 707.730000 | 3949.550000 | 544.790000 | 413.303914 | 544.790000 | 18.000000 | 21.000000 | 18.000000 | 12.727922 | 18.000000 | 408.000000 | 408.000000 | 814.000000 | 408.000000 | 140.714249 | 408.000000 | 176.000000 |
def generate_categorical_column_stats(df: pd.DataFrame, columns: list[str], summary_cols: list[str]) -> list[dict]:
    """Compute Average/Median/Max/Min of each summary column per category value.

    Fixes vs original: `columns=list[str]` and `summary_cols=list[str]` were
    *default values* set to the type object rather than annotations, and the
    return annotation claimed a DataFrame while a list of record dicts is
    returned. Both signatures bugs are corrected; behavior is unchanged.
    """
    results = []
    for c in columns:
        c_cap = str(c).capitalize()
        for val in df[c].unique():
            val_cap = str(val).capitalize()
            for sum_col in summary_cols:
                sum_col_cap = str(sum_col).capitalize()
                subset = df.loc[df[c] == val, sum_col]
                # One record per statistic, all sharing the same keys
                stats = [
                    ('Average', round(np.mean(subset), 2)),
                    ('Median', pd.Series.median(subset)),
                    ('Max', max(subset)),
                    ('Min', min(subset)),
                ]
                for stat_name, stat_value in stats:
                    results.append({'column': c_cap, 'subset': val_cap, 'variable': sum_col_cap,
                                    'statistic': stat_name, 'value': stat_value})
    return results
# Customer-level summary statistics per age bin, country and gender
sum_cols = [ 'total_revenue','number_items_bought', 'diff_items_bought', 'number_orders']
customer_summary = generate_categorical_column_stats(customers_df, ['age_bin','GEOGRAPHY', 'GENDER'], sum_cols)
# Transaction-level summary statistics per month and holiday flag
geo_sum_cols = ['total_items', 'order_value']
geo_summary = generate_categorical_column_stats(transactions, ['month', 'is_holiday'], geo_sum_cols)
# Combine both record lists and persist the sorted table for reporting
results = pd.DataFrame(customer_summary + geo_summary).sort_values(by=['column', 'subset'])
results.to_csv("data/output_1.csv")
def plot_statistical_data(df: pd.DataFrame, cat_cols: list[str], sum_cols: list[str]) -> None:
    """Plot sum/mean/median/max/min of each summary column per categorical column.

    The original repeated the same five plot stanzas; they are now driven by
    a (statistic, title template, plot kind, colour) table. Output plots and
    their order are unchanged; the 'sum' bar plot keeps the default colour.
    """
    plot_specs = [
        ('sum', 'Sum of {} by {}', 'bar', None),
        ('mean', 'Average {} by {}', 'area', 'orange'),
        ('median', 'Median {} by {}', 'area', 'yellow'),
        ('max', 'Max {} by {}', 'area', 'green'),
        ('min', 'Min {} by {}', 'area', 'purple'),
    ]
    for c in cat_cols:
        cat_name = str(c).replace('_', ' ').capitalize()
        for sum_col in sum_cols:
            sum_col_name = str(sum_col).replace('_', ' ').capitalize()
            subset = df[[c, sum_col]]
            for stat, title_template, kind, colour in plot_specs:
                grouped = getattr(subset.groupby(c), stat)().reset_index()
                plot_kwargs = {'x': c, 'y': sum_col, 'kind': kind, 'legend': False}
                if colour is not None:
                    plot_kwargs['color'] = colour
                grouped.plot(**plot_kwargs)
                plt.title(title_template.format(sum_col_name, cat_name))
                plt.xlabel(cat_name)
                plt.ylabel(sum_col_name)
                plt.show()
# Revenue share per country as a pie chart.
# numeric_only=True restricts the sum to numeric columns, silencing the
# pandas FutureWarning about the deprecated default on object columns.
geo_summary = customers_df.groupby('GEOGRAPHY').sum(numeric_only=True).reset_index()
plt.pie(geo_summary['total_revenue'], labels=geo_summary['GEOGRAPHY'],
        pctdistance = 0.8,
        labeldistance = 1.1,
        autopct=lambda x: f"{x:.1f}%\n({(x/100)*sum(geo_summary['total_revenue']):.0f} €)",
        textprops={"size": 12},
        radius = 1.2)
plt.legend(loc="best", bbox_to_anchor=(2.5,0), title="Country")
plt.show()
/tmp/ipykernel_1634/3955889209.py:1: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
geo_summary = customers_df.groupby('GEOGRAPHY').sum().reset_index()
customers_df.total_revenue.sum()
1571790.9299999997
plot_statistical_data(customers_df, ['age_bin','GEOGRAPHY', 'GENDER'], sum_cols)
plot_statistical_data(transactions, ['is_holiday'], geo_sum_cols)
#Plotting the profits per age group
customers_df[["age_bin","total_revenue"]].groupby("age_bin").sum().reset_index().plot(x="age_bin", y="total_revenue",kind="bar",legend=False)
plt.title('Total profits by age group')
plt.ylabel('Profits (EUR)')
plt.xlabel('Age Group')
plt.show()
customers_df[["age_bin","total_revenue"]].groupby("age_bin").mean().reset_index().plot(x="age_bin", y="total_revenue",kind="area",legend=False)
plt.title('Average customer revenue by age group')
plt.ylabel('Profits (EUR)')
plt.xlabel('Age Group')
plt.show()
customers_df[["age_bin","total_revenue"]].groupby("age_bin").median().reset_index().plot(x="age_bin", y="total_revenue",kind="area",legend=False)
plt.title('Median customer revenue by age group')
plt.ylabel('Profits (EUR)')
plt.xlabel('Age Group')
plt.show()
customers_df[["age_bin","number_items_bought"]].groupby("age_bin").mean().reset_index().plot(x="age_bin", y="number_items_bought",kind="area",legend=False)
plt.title('Average number of items bought by age')
plt.ylabel('Profits (EUR)')
plt.xlabel('Age Group')
plt.show()
# Derive an ordered weekday categorical from the transaction date
transactions['weekday'] = transactions.date.dt.day_name()
cats = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
transactions['weekday'] = pd.Categorical(transactions['weekday'], categories=cats, ordered=True)
# Bug fix: the transactions frame has no 'order_revenue' column — the basket
# value column is 'order_value' (created in aggregate_transactions), so the
# original four plot calls raised KeyError.
transactions.groupby('weekday').sum(numeric_only=True).reset_index().plot(x='weekday', y='order_value', kind='bar', legend=False)
plt.xlabel('Day of Week')
plt.ylabel('Total Revenue (EUR)')
plt.show()
transactions.groupby('weekday').mean(numeric_only=True).reset_index().plot(x='weekday', y='order_value', kind='bar', legend=False)
plt.xlabel('Day of Week')
plt.ylabel('Average Order Revenue (EUR)')
plt.show()
transactions.groupby('month').sum(numeric_only=True).reset_index().plot(x='month', y='order_value', kind='area', legend=False)
plt.xlabel('Month')
plt.ylabel('Total Revenue (EUR)')
plt.show()
transactions.groupby('month').mean(numeric_only=True).reset_index().plot(x='month', y='order_value', kind='area', legend=False)
plt.xlabel('Month')
plt.ylabel('Average Order Revenue (EUR)')
plt.show()
def plot_stacked_graphs(cust_df: pd.DataFrame, categories: list[str]):
    """Placeholder for per-category stacked plots.

    The original definition was left unfinished (its body ended with
    `subset =` and no right-hand side — a syntax error that broke the file).
    The working implementation of this idea is plot_stack_plots, defined
    later in this file; keep this stub only to preserve the name.
    """
    raise NotImplementedError(
        "plot_stacked_graphs was never completed; use plot_stack_plots instead")
# Customer counts, total revenue and total orders per (age bin, country, gender)
stack_subset = customers_df[['age_bin', 'GEOGRAPHY','GENDER', 'total_revenue','number_orders']].groupby(['age_bin', 'GEOGRAPHY','GENDER']).agg(number = ('age_bin', 'count'),
                                                                                                                                              total_value = ('total_revenue', 'sum'),
                                                                                                                                              total_orders = ('number_orders', 'sum')).reset_index()
# Drop the empty combinations produced by the categorical cross-product
stack_subset = stack_subset[stack_subset.number > 0]
stack_subset
| age_bin | GEOGRAPHY | GENDER | number | total_value | total_orders | |
|---|---|---|---|---|---|---|
| 7 | 18-24 | Italy | M | 3568 | 243137.85 | 10047 |
| 17 | 25-29 | Germany | M | 1537 | 115777.08 | 4486 |
| 21 | 25-29 | Italy | M | 906 | 59957.49 | 2595 |
| 31 | 30-34 | Germany | M | 3200 | 226910.09 | 9170 |
| 42 | 35-39 | France | F | 251 | 16786.70 | 774 |
| 44 | 35-39 | Germany | F | 1151 | 81876.21 | 3331 |
| 45 | 35-39 | Germany | M | 2069 | 139316.99 | 5774 |
| 56 | 40-44 | France | F | 2390 | 163398.33 | 6681 |
| 68 | 40-44 | UK | F | 806 | 65101.95 | 2610 |
| 80 | 45-49 | Spain | F | 1200 | 85769.90 | 3465 |
| 82 | 45-49 | UK | F | 1453 | 99296.66 | 4041 |
| 88 | 50-54 | Greece | F | 1445 | 99247.20 | 4316 |
| 94 | 50-54 | Spain | F | 446 | 36360.35 | 1330 |
| 102 | 55-59 | Greece | F | 817 | 50745.64 | 2196 |
| 106 | 55-59 | Netherlands | F | 314 | 19018.76 | 841 |
| 120 | 60-64 | Netherlands | F | 600 | 39297.11 | 1622 |
| 134 | 65-69 | Netherlands | F | 269 | 16038.40 | 726 |
| 148 | 70+ | Netherlands | F | 164 | 13754.22 | 501 |
def plot_stack_plots(stack_df: pd.DataFrame, cat_cols: list):
    """Draw one stacked bar chart of customer counts per (stack, x-axis) pair.

    stack_df: aggregated frame with a 'number' column (see stack_subset above).
    cat_cols: list of 2-tuples (stack_col, index_col) — stack_col becomes the
    stacked series, index_col the x-axis.
    """
    for c in cat_cols:
        subset = stack_df.pivot_table(columns=c[0], index=c[1], values='number')
        subset.plot(kind='bar', stacked=True, figsize=(12,6))
        c_title = c[0].replace('_', ' ').capitalize()
        i_title = c[1].replace('_', ' ').capitalize()
        plt.title('Number of customers per {} and {}'.format(i_title, c_title))
        plt.xlabel(i_title)
        plt.ylabel('Number of customers')
        # GENDER pivot columns sort as F, M — relabel them for readability
        if c[0] == 'GENDER':
            plt.legend(title = c_title, labels=['Female', 'Male'])
        else:
            plt.legend(title=c_title)
        plt.show()
plot_stack_plots(stack_subset, [('GEOGRAPHY', 'age_bin'), ('GENDER', 'age_bin'), ('GENDER', 'GEOGRAPHY')])
def compute_RFM(cust_df: pd.DataFrame, revenue_col: str, last_date_col: str, num_orders_col: str):
    """Score customers on Recency, Frequency and Monetary value and segment them.

    Monetary value (revenue_col) and frequency (num_orders_col) already exist
    as columns; recency is derived from last_date_col. Adds normalized rank
    columns, a 0-5 'RFM_score' and a 'customer_segment' label, then returns
    the frame sorted by its index.
    """
    # Recency: days since this customer's last purchase, relative to the
    # most recent purchase date in the whole dataset.
    newest_purchase = max(cust_df[last_date_col])
    cust_df["recency"] = cust_df[last_date_col].apply(lambda d: (newest_purchase - d).days)
    # Percentile-style ranks in (0, 100]; recent customers rank high
    # (ascending=False on recency), as do frequent buyers and big spenders.
    rank_specs = (
        ("R_rank_norm", "recency", False),
        ("F_rank_norm", num_orders_col, True),
        ("M_rank_norm", revenue_col, True),
    )
    for norm_col, source_col, ascending in rank_specs:
        ranks = cust_df[source_col].rank(ascending=ascending)
        cust_df[norm_col] = 100 * ranks / ranks.max()
    # Weighted RFM score, then rescaled from 0-100 to a 0-5 range
    weighted = (0.15 * cust_df["R_rank_norm"]
                + 0.28 * cust_df["F_rank_norm"]
                + 0.57 * cust_df["M_rank_norm"])
    cust_df["RFM_score"] = weighted.apply(lambda s: round(s, 2) * 0.05)

    def label_segment(score):
        # Thresholds reproduce the original cascading .loc assignments
        if score > 4.5:
            return 'Top Customer'
        if score > 4:
            return 'High Value Customers'
        if score > 3:
            return 'Medium Value Customer'
        if score > 1.6:
            return 'Low Value Customer'
        return 'Lost Customer'

    cust_df["customer_segment"] = cust_df["RFM_score"].apply(label_segment)
    return cust_df.sort_index()
customers_df = compute_RFM(customers_df, 'total_revenue', 'date_max', 'number_orders')
customers_df.head(2)
| GENDER | AGE | GEOGRAPHY | number_orders | order_value_min | order_value_max | total_revenue | order_value_mean | order_value_std | order_value_median | ... | total_items_std | total_items_median | diff_items_bought | age_bin | recency | R_rank_norm | F_rank_norm | M_rank_norm | RFM_score | customer_segment | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Customer_ID | |||||||||||||||||||||
| 1 | M | 22 | Italy | 1 | 16.29 | 16.29 | 16.29 | 16.290 | 0.000000 | 16.290 | ... | 0.0 | 2.0 | 2 | 18-24 | 343 | 4.164448 | 25.509165 | 39.064022 | 1.5015 | Lost Customer |
| 2 | M | 24 | Italy | 2 | 7.77 | 15.00 | 22.77 | 11.385 | 5.112382 | 11.385 | ... | 0.0 | 1.0 | 2 | 18-24 | 195 | 42.605165 | 60.287346 | 48.926326 | 2.5580 | Low Value Customer |
2 rows × 31 columns
# Revenue contribution of each RFM segment as a pie chart.
# numeric_only=True restricts the sum to numeric columns, silencing the
# pandas FutureWarning about the deprecated default on object columns.
rfm_summary = customers_df.groupby('customer_segment').sum(numeric_only=True).reset_index()
plt.pie(rfm_summary['total_revenue'], labels=rfm_summary['customer_segment'],
        pctdistance = 0.8,
        labeldistance = 1.1,
        autopct=lambda x: f"{x:.1f}%\n({(x/100)*sum(rfm_summary['total_revenue']):.0f} €)",
        textprops={"size": 12},
        radius = 1.2)
# Fix: the legend title said "Country", copied from the geography pie chart
plt.legend(loc="best", bbox_to_anchor=(2.5,0), title="Segment")
plt.show()
/tmp/ipykernel_912/579592388.py:1: FutureWarning: The default value of numeric_only in DataFrameGroupBy.sum is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.
rfm_summary = customers_df.groupby('customer_segment').sum().reset_index()
rfm_summary = customers_df.groupby('customer_segment').count().reset_index()
plt.pie(rfm_summary['total_revenue'], labels=rfm_summary['customer_segment'],
pctdistance = 0.7,
labeldistance = 1.1,
autopct=lambda x: f"{x:.1f}%\n({(x/100)*sum(rfm_summary['total_revenue']):.0f} customers)",
textprops={"size": 12},
radius = 1.2)
plt.legend(loc="best", bbox_to_anchor=(2.5,0), title="Country")
plt.show()
# First we need to select only relevant features
X = pd.get_dummies(customers_df, columns=['GENDER','GEOGRAPHY'])
X = X.select_dtypes(include=np.number)
X = X.dropna(axis=0)
print(X.shape)
X.head(2)
(22586, 34)
| AGE | number_orders | order_value_min | order_value_max | total_revenue | order_value_mean | order_value_std | order_value_median | diff_items_min | diff_items_max | ... | RFM_score | GENDER_F | GENDER_M | GEOGRAPHY_France | GEOGRAPHY_Germany | GEOGRAPHY_Greece | GEOGRAPHY_Italy | GEOGRAPHY_Netherlands | GEOGRAPHY_Spain | GEOGRAPHY_UK | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Customer_ID | |||||||||||||||||||||
| 1 | 22 | 1 | 16.29 | 16.29 | 16.29 | 16.290 | 0.000000 | 16.290 | 2 | 2 | ... | 1.5015 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 2 | 24 | 2 | 7.77 | 15.00 | 22.77 | 11.385 | 5.112382 | 11.385 | 1 | 1 | ... | 2.5580 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
2 rows × 34 columns
from sklearn.model_selection import train_test_split
#Creating train test split to be able to test stability
X_train, X_test = train_test_split(X, test_size=0.1, random_state=45658, shuffle=True)
X_train.head()
| AGE | number_orders | order_value_min | order_value_max | total_revenue | order_value_mean | order_value_std | order_value_median | diff_items_min | diff_items_max | ... | RFM_score | GENDER_F | GENDER_M | GEOGRAPHY_France | GEOGRAPHY_Germany | GEOGRAPHY_Greece | GEOGRAPHY_Italy | GEOGRAPHY_Netherlands | GEOGRAPHY_Spain | GEOGRAPHY_UK | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Customer_ID | |||||||||||||||||||||
| 21374 | 20 | 1 | 7.25 | 7.25 | 7.25 | 7.250000 | 0.000000 | 7.250 | 1 | 1 | ... | 1.2150 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 |
| 5399 | 49 | 1 | 15.65 | 15.65 | 15.65 | 15.650000 | 0.000000 | 15.650 | 3 | 3 | ... | 1.4540 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 |
| 8417 | 34 | 3 | 7.00 | 11.25 | 26.60 | 8.866667 | 2.171597 | 8.350 | 1 | 2 | ... | 2.7160 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
| 11299 | 47 | 1 | 19.92 | 19.92 | 19.92 | 19.920000 | 0.000000 | 19.920 | 2 | 2 | ... | 2.1100 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 |
| 20709 | 38 | 2 | 6.07 | 12.28 | 18.35 | 9.175000 | 4.391133 | 9.175 | 2 | 2 | ... | 2.5285 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 |
5 rows × 34 columns
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.preprocessing import FunctionTransformer
#Define a function to remove highly correlated features
#Define a function to remove highly correlated features
def correlated_features(dataset, threshold=0.8):
    """Return the set of column labels whose absolute Pearson correlation with
    an *earlier* column exceeds *threshold* (so the first of a pair survives)."""
    frame = pd.DataFrame(dataset)
    corr = frame.corr().abs()
    flagged = set()
    for i, name in enumerate(corr.columns):
        # Only look below the diagonal: compare column i with columns 0..i-1
        if any(corr.iloc[i, j] > threshold for j in range(i)):
            flagged.add(name)
    return flagged
class CorrSelector(BaseEstimator, TransformerMixin):
    """Transformer dropping features highly correlated with an earlier feature.

    Fixes vs original: transform indexed `columns` with *labels* (which only
    worked because numpy input yields an integer RangeIndex) — it now drops by
    label directly; get_feature_names_out pop()ed while iterating, shifting
    positions and removing the wrong names (or raising IndexError) whenever
    two or more columns were flagged; and transform no longer retains a
    reference to the full input array.
    """

    def __init__(self, threshold=0.8):
        # Absolute-correlation cutoff above which the later column is dropped
        self.threshold = threshold
        self.correlated_cols = None

    def fit(self, X, y=None):
        """Record both the labels and the positional indexes of flagged columns."""
        self.correlated_cols = correlated_features(X, threshold=self.threshold)
        self.correlated_indexes = [pd.DataFrame(X).columns.get_loc(c) for c in self.correlated_cols]
        return self

    def transform(self, X, y=None, **kwargs):
        """Return X (as a DataFrame) without the correlated columns found in fit."""
        X_df = pd.DataFrame(X)
        return X_df.drop(columns=list(self.correlated_cols))

    def get_params(self, deep=False):
        return {"threshold": self.threshold}

    def get_feature_names_out(self, features_out):
        """Map input feature names to the surviving ones, by position."""
        dropped = set(self.correlated_indexes)
        return [f for i, f in enumerate(features_out) if i not in dropped]
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import Pipeline
#Preprocessing pipeline with scaling and feature selection
preprocess_pipe = Pipeline([
('var', VarianceThreshold(threshold=0.04)),
('corr', CorrSelector()),
('scaler', MinMaxScaler())
])
X_train_tr = preprocess_pipe.fit_transform(X_train)
X_test_tr = preprocess_pipe.transform(X_test)
X_train_tr.shape
(20327, 22)
We will perform KMeans clustering to classify our customers into different segments
def cluster_stability(X, est, n_iter=10, random_state=None):
    """Estimate clustering stability via bootstrap resampling.

    Fits n_iter clones of *est* on bootstrap samples of X and returns the
    mean pairwise adjusted Rand index computed on the points each pair of
    samples has in common (1.0 = perfectly stable).

    Fix vs original: the random_state parameter was ignored — samples and
    estimator seeds came from the global numpy.random module. They are now
    drawn from a generator seeded through check_random_state.
    """
    rng_ = check_random_state(random_state)
    labels = []
    indices = []
    for _ in range(n_iter):
        # draw a bootstrap sample (with replacement) and remember its indices
        sample_indices = rng_.randint(0, X.shape[0], X.shape[0])
        indices.append(sample_indices)
        est = clone(est)
        if hasattr(est, "random_state"):
            # randomize the estimator itself when it supports it
            est.random_state = rng_.randint(100000)
        X_bootstrap = X[sample_indices]
        est.fit(X_bootstrap)
        # store the clustering outcome in terms of the original indices;
        # -1 marks points absent from this bootstrap sample
        relabel = -np.ones(X.shape[0], dtype=np.int32)
        relabel[sample_indices] = est.labels_
        labels.append(relabel)
    scores = []
    for l, i in zip(labels, indices):
        for k, j in zip(labels, indices):
            # compare the two labelings on points present in both samples
            in_both = np.intersect1d(i, j)
            scores.append(adjusted_rand_score(l[in_both], k[in_both]))
    return np.mean(scores)
# Grid-search K for KMeans with 5-fold CV, tracking inertia (SSD),
# silhouette and a cross-fold stability index (adjusted Rand).
kf = KFold(n_splits=5)
scores = []
for k in range(2, 11):  # Higher values of K were tested separately and underperform
    labels = []
    indices = []
    scores_stab = []
    SSD = []
    SIL = []
    for train_index, test_index in kf.split(X_train_tr):
        # Fix: use a fold-local name — the original rebound (and then deleted)
        # the global X_train/X_test produced by the earlier train_test_split.
        X_fold_train = X_train_tr[train_index]
        km_tr = KMeans(n_clusters=k, random_state=10, n_init=10).fit(X_fold_train)
        cluster_labels = km_tr.labels_
        SSD.append(km_tr.inertia_)
        sil = silhouette_score(X_fold_train, cluster_labels)
        SIL.append(sil)
        print("K: {}, Silhouette: {}".format(k, sil))
        # Calculating stability on the train folds: keep labels in terms of
        # positions in the full training matrix (-1 = not in this fold)
        indices.append(train_index)
        relabel = -np.ones(X_train_tr.shape[0], dtype=np.int32)
        relabel[train_index] = cluster_labels
        labels.append(relabel)
        gc.enable()
        del X_fold_train
        gc.collect()
    for l, i in zip(labels, indices):
        for m, j in zip(labels, indices):
            # we also compute the diagonal
            in_both = np.intersect1d(i, j)
            scores_stab.append(adjusted_rand_score(l[in_both], m[in_both]))
    print("K: {}, SSD: {}, Silhouette: {}, Stability: {}".format(k, np.mean(SSD), np.mean(SIL), np.mean(scores_stab)))
    scores.append({'K': k, 'SSD': np.mean(SSD), 'Silhouette': np.mean(SIL), 'Stability': np.mean(scores_stab)})
scores_df = pd.DataFrame(scores)
K: 2, Silhouette: 0.2516228097170912 K: 2, Silhouette: 0.25138709649399626 K: 2, Silhouette: 0.252910788366884 K: 2, Silhouette: 0.25223851290714133 K: 2, Silhouette: 0.25236665188865876 K: 2, SSD: 14785.527455661073, Silhouette: 0.25210517187475434, Stability: 1.0 K: 3, Silhouette: 0.350394411981345 K: 3, Silhouette: 0.34965346097513106 K: 3, Silhouette: 0.35034781202914256 K: 3, Silhouette: 0.35020485633944437 K: 3, Silhouette: 0.3503381105374748 K: 3, SSD: 11566.633558894715, Silhouette: 0.35018773037250756, Stability: 1.0 K: 4, Silhouette: 0.40477614970416337 K: 4, Silhouette: 0.4048421152892709 K: 4, Silhouette: 0.40381689640174806 K: 4, Silhouette: 0.40489833070019393 K: 4, Silhouette: 0.40433101089042717 K: 4, SSD: 9748.939701966283, Silhouette: 0.4045329005971607, Stability: 1.0 K: 5, Silhouette: 0.4549838247318755 K: 5, Silhouette: 0.45530356387546006 K: 5, Silhouette: 0.4537874464663286 K: 5, Silhouette: 0.4547051844814369 K: 5, Silhouette: 0.4549426564609264 K: 5, SSD: 8173.364556216188, Silhouette: 0.45474453520320546, Stability: 1.0 K: 6, Silhouette: 0.5003022650680633 K: 6, Silhouette: 0.5003740042443241 K: 6, Silhouette: 0.499208599014605 K: 6, Silhouette: 0.5001026756353519 K: 6, Silhouette: 0.49591836401190537 K: 6, SSD: 6795.001231634733, Silhouette: 0.4991811815948499, Stability: 0.9804651613823048 K: 7, Silhouette: 0.5357297604739991 K: 7, Silhouette: 0.5358604728737244 K: 7, Silhouette: 0.5355126897711169 K: 7, Silhouette: 0.5352355174634308 K: 7, Silhouette: 0.5357054844263481 K: 7, SSD: 5666.994592789537, Silhouette: 0.5356087850017238, Stability: 1.0 K: 8, Silhouette: 0.4784735587251483 K: 8, Silhouette: 0.4790929924522786 K: 8, Silhouette: 0.47965252498732697 K: 8, Silhouette: 0.47836958750527075 K: 8, Silhouette: 0.478651222891815 K: 8, SSD: 4710.136590545218, Silhouette: 0.47884797731236795, Stability: 0.999253755590957 K: 9, Silhouette: 0.4612253162152353 K: 9, Silhouette: 0.46074163157312353 K: 9, Silhouette: 0.4615304158900736 K: 9, 
Silhouette: 0.4600039365636909 K: 9, Silhouette: 0.46016512226765033 K: 9, SSD: 4189.253378409412, Silhouette: 0.4607332845019547, Stability: 0.9982089625338378 K: 10, Silhouette: 0.44812749533132656 K: 10, Silhouette: 0.44795054445324645 K: 10, Silhouette: 0.449323798890992 K: 10, Silhouette: 0.4469403914148095 K: 10, Silhouette: 0.447289863364888 K: 10, SSD: 3884.028523241775, Silhouette: 0.4479264186910525, Stability: 0.9976678385306192
# --- Diagnostic plots used to pick the number of clusters K ---

#Plotting our Elbow Score
elbow_ax = plt.figure().gca()
elbow_ax.plot(scores_df.K, scores_df.SSD, 'x-', color="blue", label="SSD")
elbow_ax.set_xlabel('Values of K')
elbow_ax.set_ylabel('Sum of squared distances')
elbow_ax.set_title('Elbow Method For Optimal k')
# Force an integer x-axis: K is a cluster count.
elbow_ax.xaxis.set_major_locator(MaxNLocator(integer=True))
plt.show()

#Displaying the Silhouette score
sil_ax = plt.figure().gca()
sil_ax.plot(scores_df.K, scores_df.Silhouette, 'x-', color="green", label="Silhouette score")
sil_ax.set_xlabel('Values of K')
sil_ax.set_ylabel('Silhouette score')
sil_ax.set_title('Silhouette analysis For Optimal k')
sil_ax.xaxis.set_major_locator(MaxNLocator(integer=True))
plt.show()

#Displaying our Stability Index
stab_ax = plt.figure().gca()
stab_ax.plot(scores_df.K, scores_df.Stability, 'x-', color="red", label="Stability Index")
stab_ax.set_xlabel('Values of K')
stab_ax.set_ylabel('Stability Index')
stab_ax.set_title('Stability analysis For Optimal k')
stab_ax.xaxis.set_major_locator(MaxNLocator(integer=True))
plt.show()
%%capture --no-display
# Cross-check the choice of K with the clusteval library, which runs its
# own internal evaluation (silhouette by default) over a range of K.
from clusteval import clusteval
ce = clusteval(cluster='kmeans')
results = ce.fit(X_train_tr)
# 'labx' holds the cluster label assigned to each row of X_train_tr.
print(results['labx'])
ce.plot()
ce.scatter(X_train_tr)
(<Figure size 2500x1500 with 1 Axes>, <Axes: xlabel='x-axis', ylabel='y-axis'>)
#Selecting 7 as our best amount of clusters
km = KMeans(n_clusters=7, n_init=10)
km_clusters = km.fit_predict(X_train_tr)

# Create a PCA model to reduce our data for visualisation; start with 10
# components so the scree plot shows where the explained variance flattens.
n_components = 10
pca = PCA(n_components=n_components, n_oversamples=10)
pca.fit(X_train_tr)
display_scree_plot(pca)

#Restricting to 7 components to have 90% of explained variance
n_components = 7
pca = PCA(n_components=n_components, n_oversamples=10)
pca.fit(X_train_tr)
# Transform the scaled data to the new PCA space.
X_train_red = pca.transform(X_train_tr)
# Project the KMeans centroids into the same reduced space.
centres_reduced = pca.transform(km.cluster_centers_)

# Plot each consecutive pair of principal components, coloured by cluster.
# BUG FIX: the original passed `clusters` (undefined/stale at this point in
# the notebook); the labels produced just above are `km_clusters`.
for i in range(1, n_components):
    display_factorial_planes(X_train_red, n_components, pca, [(i-1, i)],
                             illustrative_var=km_clusters, alpha=0.8)
    # Overlay the projected centroids as red crosses.
    plt.scatter(centres_reduced[:, i-1], centres_reduced[:, i],
                marker='x', s=169, linewidths=3,
                color='r', zorder=10)
    plt.tight_layout()
    plt.show()
def display_parallel_coordinates_plot(X_tr: pd.DataFrame, cluster_col: str):
    """Display parallel-coordinates plots of the per-cluster feature means.

    Parameters
    ----------
    X_tr : pd.DataFrame
        Clustered data, one row per sample; `cluster_col` holds the labels.
    cluster_col : str
        Name of the cluster-label column (-1 marks unclustered/noise rows).
    """
    #Removing unclustered datapoints (label -1)
    df_mean = X_tr.drop(X_tr[X_tr[cluster_col] == -1].index)
    #Aggregating by cluster and calculating the means of each field
    # BUG FIX: the original grouped the unfiltered X_tr here, which silently
    # re-introduced the noise rows dropped just above.
    df_mean = df_mean.groupby(cluster_col).mean()
    #Rescaling data for better visualization
    df_mean_scaled = StandardScaler().fit_transform(df_mean)
    #Transforming our data back into a dataframe
    df_mean = pd.DataFrame(df_mean_scaled, columns=df_mean.columns.to_list()).reset_index()
    df_mean.rename(columns={"index": "cluster"}, inplace=True)
    #Adding 1 to the cluster numbers just to have numbers from 1 to 7
    df_mean["cluster"] = df_mean["cluster"] + 1
    clusters = df_mean["cluster"]
    # Plot the features 9 columns at a time so each plot stays readable.
    # BUG FIX: the original chunking mixed //8 with a step of 9 and compared
    # the offset against len(X_tr) (the ROW count), so trailing columns that
    # did not fill a whole chunk were never displayed.
    for start in range(0, len(df_mean.columns), 9):
        subset = df_mean.iloc[:, start:start + 9].copy()
        # The first chunk already contains the "cluster" column; re-attach
        # it to the later chunks so the plot can colour by cluster.
        if "cluster" not in subset.columns:
            subset["cluster"] = clusters
        display_parallel_coordinates_centroids(subset, 7)
    plt.show()
# Fit the preprocessing pipeline and cluster the transformed data with KMeans.
X_tr = preprocess_pipe.fit_transform(X)
km = KMeans(n_clusters=7, n_init='auto')
km_clusters = km.fit_predict(X_tr)
# Rebuild readable column names: the pipeline's correlation filter (second to
# last step) dropped some columns, so remove those from X's columns as well.
dropped_idx = list(preprocess_pipe[-2].correlated_cols)
cols = list(X.drop(X.columns[dropped_idx], axis=1).columns)
X_tr = pd.DataFrame(X_tr, columns=cols)
X_tr['km_cluster'] = km_clusters
display_parallel_coordinates_plot(X_tr, 'km_cluster')
To tune the hyperparameters of the DBSCAN algorithm, we will use as a starting point the heuristics defined in the latest DBSCAN paper (https://www.ccs.neu.edu/home/vip/teach/DMcourse/2_cluster_EM_mixt/notes_slides/revisitofrevisitDBSCAN.pdf). To set eps, it is advised to perform k-dist plotting using the NearestNeighbors algorithm with:
k = 2*number of features -1
For min_samples, the starting point should generally be twice the number of features, or 46 in that case.
#Defining kdist plot
from sklearn.neighbors import NearestNeighbors
def get_kdist_plot(X=None, k=None, radius_nbrs=1.0):
    """Plot the sorted distance of every point to its k-th nearest neighbour.

    The elbow of this curve is the usual heuristic for choosing DBSCAN's
    eps parameter.
    """
    knn = NearestNeighbors(n_neighbors=k, radius=radius_nbrs).fit(X)
    # Distance from every point to each of its k nearest neighbours.
    dists, _ = knn.kneighbors(X)
    # Keep only the k-th neighbour distance, sorted ascending for the plot.
    kth_dists = np.sort(dists[:, k - 1])
    # Plot the sorted K-nearest neighbor distance for each point in the dataset
    plt.figure(figsize=(8, 8))
    plt.plot(kth_dists)
    plt.xlabel('Points/Objects in the dataset', fontsize=12)
    plt.ylabel('Sorted {}-nearest neighbor distance'.format(k), fontsize=12)
    plt.grid(True, linestyle="--", color='black', alpha=0.4)
    plt.show()
    plt.close()
#Defining K as mentioned in the article
k = 2 * X_tr.shape[-1] - 1 # k=2*{dim(dataset)} - 1
# NOTE(review): k is derived from X_tr's feature count but the plot is drawn
# on X_train_tr — assumes both share the same number of features; confirm.
get_kdist_plot(X=X_train_tr, k=k)
We see in the kdist plot above that the elbow is at around 0.2, so we will look for an eps at around 0.2
#Specify parameters to sample from
param_grid = list(ParameterGrid(
    {
        'min_samples': [10,50,100,200,300,400,500,700],
        'eps' : [0.2,0.25,0.3,0.37,0.45,0.55,0.6, 0.65, 0.75, 0.85, 0.95],
        'metric': ['euclidean','manhattan']
    }))
scores = []
kf = KFold(n_splits=3)
SIL = []
i = 0
#Performing manual gridsearch: for each hyperparameter combination, fit a
#DBSCAN on each of 3 folds and collect silhouette + stability scores.
for params in param_grid:
    labels = []
    indices = []
    scores_stab = []
    # Reset the per-combination silhouette scores (shadowing the outer SIL).
    SIL = []
    for train_index, test_index in kf.split(X_train_tr):
        i+=1
        # Progress indicator: one line every 10 fits.
        if i % 10 == 0:
            print("{}th iteration".format(i))
        X_train = X_train_tr[train_index]
        X_test = X_train_tr[test_index]
        dbs = DBSCAN(n_jobs=-1,
                     min_samples=params['min_samples'],
                     eps=params['eps'],
                     metric=params['metric']).fit(X_train)
        clusters = dbs.labels_
        # Silhouette is only defined when DBSCAN found more than one label
        # (noise counts as a label here); otherwise score the fold as 0.
        if len(np.unique(clusters)) > 1:
            sil = silhouette_score(X_train, clusters)
        else:
            sil = 0
        SIL.append(sil)
        #Calculating stability on the train set: scatter this fold's labels
        #back into a full-length array (-1 where the row was not in the fold)
        #so folds can later be compared on their overlapping rows.
        indices.append(train_index)
        relabel = -np.ones(X_train_tr.shape[0], dtype=np.int32)
        relabel[train_index] = clusters
        labels.append(relabel)
        # min_samples expressed as a fraction of the fold size, so the best
        # value can be rescaled to the full dataset later.
        min_samples_pct = params['min_samples'] / len(X_train)
        # Free the fold copies before the next iteration to limit peak memory.
        gc.enable()
        del X_train, X_test
        gc.collect()
    # Pairwise stability: adjusted Rand index between every pair of folds,
    # computed on the rows the two folds have in common.
    for k, l in zip(labels, indices):
        for m, j in zip(labels, indices):
            # we also compute the diagonal
            in_both = np.intersect1d(l, j)
            scores_stab.append(adjusted_rand_score(k[in_both], m[in_both]))
    print("min_samples: {}, eps: {}, metric: {} ==> Silhouette: {}, Stability {}".format(
        params['min_samples'], params['eps'], params['metric'], np.mean(SIL), np.mean(scores_stab)))
    scores.append({'min_samples': params['min_samples'], 'min_samples_pct': min_samples_pct,'eps': params['eps'],
                   'metric': params['metric'],'Silhouette': np.mean(SIL), 'Stability': np.mean(scores_stab)})
scores_dbs = pd.DataFrame(scores)
min_samples: 10, eps: 0.2, metric: euclidean ==> Silhouette: 0.38134778158869814, Stability 0.9920014439208704 min_samples: 50, eps: 0.2, metric: euclidean ==> Silhouette: 0.24648134671033828, Stability 0.95949091407407 min_samples: 100, eps: 0.2, metric: euclidean ==> Silhouette: 0.10522103800755117, Stability 0.926277917355204 10th iteration min_samples: 200, eps: 0.2, metric: euclidean ==> Silhouette: 0.027902347936477116, Stability 0.9608240732452269 min_samples: 300, eps: 0.2, metric: euclidean ==> Silhouette: 0.0005950667817801921, Stability 0.9082881513443863 min_samples: 400, eps: 0.2, metric: euclidean ==> Silhouette: -0.01497786762477805, Stability 0.9793575445717396 20th iteration min_samples: 500, eps: 0.2, metric: euclidean ==> Silhouette: -0.020783028990012706, Stability 0.9719443450585928 min_samples: 700, eps: 0.2, metric: euclidean ==> Silhouette: 0.0, Stability 1.0 min_samples: 10, eps: 0.2, metric: manhattan ==> Silhouette: -0.15047993923409822, Stability 0.8959281181543565 30th iteration min_samples: 50, eps: 0.2, metric: manhattan ==> Silhouette: -0.09383031653530881, Stability 0.9522280398796654 min_samples: 100, eps: 0.2, metric: manhattan ==> Silhouette: -0.06263888790570372, Stability 0.9647951285145102 min_samples: 200, eps: 0.2, metric: manhattan ==> Silhouette: -0.023346156152077413, Stability 0.5555555555555556 min_samples: 300, eps: 0.2, metric: manhattan ==> Silhouette: 0.0, Stability 1.0 40th iteration min_samples: 400, eps: 0.2, metric: manhattan ==> Silhouette: 0.0, Stability 1.0 min_samples: 500, eps: 0.2, metric: manhattan ==> Silhouette: 0.0, Stability 1.0 min_samples: 700, eps: 0.2, metric: manhattan ==> Silhouette: 0.0, Stability 1.0 50th iteration min_samples: 10, eps: 0.25, metric: euclidean ==> Silhouette: 0.3982836263677331, Stability 0.9970550543555705 min_samples: 50, eps: 0.25, metric: euclidean ==> Silhouette: 0.36500438319482603, Stability 0.9887607675109934 min_samples: 100, eps: 0.25, metric: euclidean ==> 
Silhouette: 0.29159797396019854, Stability 0.9705232485064607 60th iteration min_samples: 200, eps: 0.25, metric: euclidean ==> Silhouette: 0.14093019755205763, Stability 0.9727441564529086 min_samples: 300, eps: 0.25, metric: euclidean ==> Silhouette: 0.0859336619595562, Stability 0.9768741399291151 min_samples: 400, eps: 0.25, metric: euclidean ==> Silhouette: 0.023304457474252274, Stability 0.9713794069135733 min_samples: 500, eps: 0.25, metric: euclidean ==> Silhouette: 0.0739882410457942, Stability 0.9925693951187488 70th iteration min_samples: 700, eps: 0.25, metric: euclidean ==> Silhouette: 0.0041334791395695155, Stability 0.9939271488993567 min_samples: 10, eps: 0.25, metric: manhattan ==> Silhouette: 0.049708242448888974, Stability 0.9171695297672338 min_samples: 50, eps: 0.25, metric: manhattan ==> Silhouette: -0.06295188157820629, Stability 0.9442693060596241 80th iteration min_samples: 100, eps: 0.25, metric: manhattan ==> Silhouette: -0.08446023182647659, Stability 0.974585096037371 min_samples: 200, eps: 0.25, metric: manhattan ==> Silhouette: -0.0024070120744104956, Stability 0.9706368767157814 min_samples: 300, eps: 0.25, metric: manhattan ==> Silhouette: -0.03699936077591547, Stability 0.5262985291216826 90th iteration min_samples: 400, eps: 0.25, metric: manhattan ==> Silhouette: 0.0, Stability 1.0 min_samples: 500, eps: 0.25, metric: manhattan ==> Silhouette: 0.0, Stability 1.0 min_samples: 700, eps: 0.25, metric: manhattan ==> Silhouette: 0.0, Stability 1.0 min_samples: 10, eps: 0.3, metric: euclidean ==> Silhouette: 0.4039953369441629, Stability 0.998112421136881 100th iteration min_samples: 50, eps: 0.3, metric: euclidean ==> Silhouette: 0.3964786993772072, Stability 0.9967032699612643 min_samples: 100, eps: 0.3, metric: euclidean ==> Silhouette: 0.3674653419032725, Stability 0.9904813371697103 min_samples: 200, eps: 0.3, metric: euclidean ==> Silhouette: 0.2552507566808017, Stability 0.9223478170454708 110th iteration min_samples: 300, eps: 
0.3, metric: euclidean ==> Silhouette: 0.15330446671732986, Stability 0.9608065122715735 min_samples: 400, eps: 0.3, metric: euclidean ==> Silhouette: 0.13102584221975563, Stability 0.9588476810263656 min_samples: 500, eps: 0.3, metric: euclidean ==> Silhouette: 0.0786189020048599, Stability 0.9881572684396135 120th iteration min_samples: 700, eps: 0.3, metric: euclidean ==> Silhouette: 0.0815704094073396, Stability 0.9956280861830132 min_samples: 10, eps: 0.3, metric: manhattan ==> Silhouette: 0.2608630914338302, Stability 0.9567454654275068 min_samples: 50, eps: 0.3, metric: manhattan ==> Silhouette: 0.05514733587112183, Stability 0.9385612642003941 min_samples: 100, eps: 0.3, metric: manhattan ==> Silhouette: 0.005719533518025492, Stability 0.9527084938884379 130th iteration min_samples: 200, eps: 0.3, metric: manhattan ==> Silhouette: -0.0028118712770695124, Stability 0.9800405435342948 min_samples: 300, eps: 0.3, metric: manhattan ==> Silhouette: 0.02934142165254429, Stability 0.9834642798036506 min_samples: 400, eps: 0.3, metric: manhattan ==> Silhouette: -0.032152601139510795, Stability 0.9731302136594228 140th iteration min_samples: 500, eps: 0.3, metric: manhattan ==> Silhouette: 0.0, Stability 1.0 min_samples: 700, eps: 0.3, metric: manhattan ==> Silhouette: 0.0, Stability 1.0 min_samples: 10, eps: 0.37, metric: euclidean ==> Silhouette: 0.4080305701103566, Stability 0.9989935765486844 150th iteration min_samples: 50, eps: 0.37, metric: euclidean ==> Silhouette: 0.40622044841283556, Stability 0.9986085034676354 min_samples: 100, eps: 0.37, metric: euclidean ==> Silhouette: 0.40119687149205707, Stability 0.9979363456665461 min_samples: 200, eps: 0.37, metric: euclidean ==> Silhouette: 0.35733915538294475, Stability 0.9823173575900791 min_samples: 300, eps: 0.37, metric: euclidean ==> Silhouette: 0.27750538671495384, Stability 0.9891442636868477 160th iteration min_samples: 400, eps: 0.37, metric: euclidean ==> Silhouette: 0.22423665585055677, Stability 
0.9810788048562669 min_samples: 500, eps: 0.37, metric: euclidean ==> Silhouette: 0.21014300325207466, Stability 0.9899072662601848 min_samples: 700, eps: 0.37, metric: euclidean ==> Silhouette: 0.11766321211177218, Stability 0.9248159519478875 170th iteration min_samples: 10, eps: 0.37, metric: manhattan ==> Silhouette: 0.3403495639576404, Stability 0.9810157970902303 min_samples: 50, eps: 0.37, metric: manhattan ==> Silhouette: 0.1836162989361372, Stability 0.9378423333503874 min_samples: 100, eps: 0.37, metric: manhattan ==> Silhouette: 0.09142913911285705, Stability 0.9457166969691184 180th iteration min_samples: 200, eps: 0.37, metric: manhattan ==> Silhouette: 0.009952020657967084, Stability 0.9670071042542049 min_samples: 300, eps: 0.37, metric: manhattan ==> Silhouette: 0.020148779132977763, Stability 0.9838758295403649 min_samples: 400, eps: 0.37, metric: manhattan ==> Silhouette: 0.05476471862509246, Stability 0.989068049829227 min_samples: 500, eps: 0.37, metric: manhattan ==> Silhouette: 0.013847765779870307, Stability 0.910904441032574 190th iteration min_samples: 700, eps: 0.37, metric: manhattan ==> Silhouette: 0.0, Stability 1.0 min_samples: 10, eps: 0.45, metric: euclidean ==> Silhouette: 0.4092583587805538, Stability 0.9995098529963923 min_samples: 50, eps: 0.45, metric: euclidean ==> Silhouette: 0.4088902453709659, Stability 0.9996998365123666 200th iteration min_samples: 100, eps: 0.45, metric: euclidean ==> Silhouette: 0.40795613138608045, Stability 0.9996007222399506 min_samples: 200, eps: 0.45, metric: euclidean ==> Silhouette: 0.40035701989472144, Stability 0.9983077659415414 min_samples: 300, eps: 0.45, metric: euclidean ==> Silhouette: 0.3671471273128302, Stability 0.9947660679433254 210th iteration min_samples: 400, eps: 0.45, metric: euclidean ==> Silhouette: 0.29525403057467486, Stability 0.9874497259990606 min_samples: 500, eps: 0.45, metric: euclidean ==> Silhouette: 0.2526075415147549, Stability 0.926408744486261 min_samples: 700, 
eps: 0.45, metric: euclidean ==> Silhouette: 0.2281512190743528, Stability 0.9910743124552636 min_samples: 10, eps: 0.45, metric: manhattan ==> Silhouette: 0.37942945420436996, Stability 0.9908235911942376 220th iteration min_samples: 50, eps: 0.45, metric: manhattan ==> Silhouette: 0.30614474358625965, Stability 0.9734313871793563 min_samples: 100, eps: 0.45, metric: manhattan ==> Silhouette: 0.20867318867368886, Stability 0.9299612774131449 min_samples: 200, eps: 0.45, metric: manhattan ==> Silhouette: 0.12149897336455918, Stability 0.9767484490042904 230th iteration min_samples: 300, eps: 0.45, metric: manhattan ==> Silhouette: 0.06119889500050605, Stability 0.979432701855139 min_samples: 400, eps: 0.45, metric: manhattan ==> Silhouette: 0.05191947280199283, Stability 0.9729744848146381 min_samples: 500, eps: 0.45, metric: manhattan ==> Silhouette: 0.07440578423098068, Stability 0.9969264931516675 240th iteration min_samples: 700, eps: 0.45, metric: manhattan ==> Silhouette: 0.038133048171239574, Stability 0.96653321995092 min_samples: 10, eps: 0.55, metric: euclidean ==> Silhouette: 0.535016340762694, Stability 0.9995590127462335 min_samples: 50, eps: 0.55, metric: euclidean ==> Silhouette: 0.5346635110292332, Stability 0.9996599122142378 min_samples: 100, eps: 0.55, metric: euclidean ==> Silhouette: 0.5343020466109144, Stability 0.9996861022151032 250th iteration min_samples: 200, eps: 0.55, metric: euclidean ==> Silhouette: 0.5333722430802531, Stability 0.9997199515705728 min_samples: 300, eps: 0.55, metric: euclidean ==> Silhouette: 0.5189513534925299, Stability 0.996982375891034 min_samples: 400, eps: 0.55, metric: euclidean ==> Silhouette: 0.48565276271834995, Stability 0.9986415309431236 260th iteration min_samples: 500, eps: 0.55, metric: euclidean ==> Silhouette: 0.48807689697748846, Stability 0.9971645968483516 min_samples: 700, eps: 0.55, metric: euclidean ==> Silhouette: 0.3551982592510081, Stability 0.9278486168742273 min_samples: 10, eps: 0.55, 
metric: manhattan ==> Silhouette: 0.5093191267007352, Stability 0.9964140283170259 270th iteration min_samples: 50, eps: 0.55, metric: manhattan ==> Silhouette: 0.47298778178898654, Stability 0.9918778654924105 min_samples: 100, eps: 0.55, metric: manhattan ==> Silhouette: 0.4149458973404591, Stability 0.9871435604018988 min_samples: 200, eps: 0.55, metric: manhattan ==> Silhouette: 0.2836109799010655, Stability 0.9791031025327493 min_samples: 300, eps: 0.55, metric: manhattan ==> Silhouette: 0.20525420203044975, Stability 0.9754836324543016 280th iteration min_samples: 400, eps: 0.55, metric: manhattan ==> Silhouette: 0.09077813905757605, Stability 0.9674628672279554 min_samples: 500, eps: 0.55, metric: manhattan ==> Silhouette: 0.10065797106310888, Stability 0.9836884571383833 min_samples: 700, eps: 0.55, metric: manhattan ==> Silhouette: 0.0901388789622762, Stability 0.9894757295309423 290th iteration min_samples: 10, eps: 0.6, metric: euclidean ==> Silhouette: 0.5351642020694914, Stability 0.9996578811111547 min_samples: 50, eps: 0.6, metric: euclidean ==> Silhouette: 0.5350759624219149, Stability 0.9998303121534298 min_samples: 100, eps: 0.6, metric: euclidean ==> Silhouette: 0.5348752950229257, Stability 0.9996877823952839 300th iteration min_samples: 200, eps: 0.6, metric: euclidean ==> Silhouette: 0.5343718796770794, Stability 0.9997555147967686 min_samples: 300, eps: 0.6, metric: euclidean ==> Silhouette: 0.5326087529604485, Stability 0.9995890552872938 min_samples: 400, eps: 0.6, metric: euclidean ==> Silhouette: 0.5228783271874636, Stability 0.9991879450997396 min_samples: 500, eps: 0.6, metric: euclidean ==> Silhouette: 0.5045159999120745, Stability 0.9941656610843642 310th iteration min_samples: 700, eps: 0.6, metric: euclidean ==> Silhouette: 0.45681517127427673, Stability 0.9944203231951396 min_samples: 10, eps: 0.6, metric: manhattan ==> Silhouette: 0.5147547382998995, Stability 0.9969842288880295 min_samples: 50, eps: 0.6, metric: manhattan ==> 
Silhouette: 0.4958494134807528, Stability 0.9939680407043981 320th iteration min_samples: 100, eps: 0.6, metric: manhattan ==> Silhouette: 0.447447303365633, Stability 0.9888118724598836 min_samples: 200, eps: 0.6, metric: manhattan ==> Silhouette: 0.31808734237418174, Stability 0.9760447048617146 min_samples: 300, eps: 0.6, metric: manhattan ==> Silhouette: 0.24207011715342222, Stability 0.9746973653751895 330th iteration min_samples: 400, eps: 0.6, metric: manhattan ==> Silhouette: 0.18792485507610354, Stability 0.9541878600041355 min_samples: 500, eps: 0.6, metric: manhattan ==> Silhouette: 0.09662715249517105, Stability 0.9723329299623037 min_samples: 700, eps: 0.6, metric: manhattan ==> Silhouette: 0.08804046795968841, Stability 0.8918901754250427 min_samples: 10, eps: 0.65, metric: euclidean ==> Silhouette: 0.5352247063017351, Stability 0.9997512802905989 340th iteration min_samples: 50, eps: 0.65, metric: euclidean ==> Silhouette: 0.5352138451166903, Stability 0.9998349112471742 min_samples: 100, eps: 0.65, metric: euclidean ==> Silhouette: 0.5351638488650604, Stability 0.9997966682716792 min_samples: 200, eps: 0.65, metric: euclidean ==> Silhouette: 0.5349152254545145, Stability 0.9999878438328828 350th iteration min_samples: 300, eps: 0.65, metric: euclidean ==> Silhouette: 0.5343467118562225, Stability 0.9998999145017399 min_samples: 400, eps: 0.65, metric: euclidean ==> Silhouette: 0.5298075057002981, Stability 0.9995314406663661 min_samples: 500, eps: 0.65, metric: euclidean ==> Silhouette: 0.5217255405260298, Stability 0.9993764235100898 360th iteration min_samples: 700, eps: 0.65, metric: euclidean ==> Silhouette: 0.4866674526253914, Stability 0.9975524544853879 min_samples: 10, eps: 0.65, metric: manhattan ==> Silhouette: 0.5189026245950861, Stability 0.9972289782566714 min_samples: 50, eps: 0.65, metric: manhattan ==> Silhouette: 0.5064670017148066, Stability 0.9961425234549094 min_samples: 100, eps: 0.65, metric: manhattan ==> Silhouette: 
0.47671688490236136, Stability 0.9914521047745127 370th iteration min_samples: 200, eps: 0.65, metric: manhattan ==> Silhouette: 0.3589489431235701, Stability 0.975596082029796 min_samples: 300, eps: 0.65, metric: manhattan ==> Silhouette: 0.31132932963380267, Stability 0.981007068525204 min_samples: 400, eps: 0.65, metric: manhattan ==> Silhouette: 0.2406027269946391, Stability 0.9849451653340751 380th iteration min_samples: 500, eps: 0.65, metric: manhattan ==> Silhouette: 0.1990880924859426, Stability 0.9824009775901901 min_samples: 700, eps: 0.65, metric: manhattan ==> Silhouette: 0.11802185793951958, Stability 0.9871430315250751 min_samples: 10, eps: 0.75, metric: euclidean ==> Silhouette: 0.5353896713890319, Stability 0.9999569952009715 390th iteration min_samples: 50, eps: 0.75, metric: euclidean ==> Silhouette: 0.5353960081663772, Stability 0.9998764709064033 min_samples: 100, eps: 0.75, metric: euclidean ==> Silhouette: 0.5353619726070854, Stability 0.9998257042725318 min_samples: 200, eps: 0.75, metric: euclidean ==> Silhouette: 0.5353778347258724, Stability 0.9999089720858392 min_samples: 300, eps: 0.75, metric: euclidean ==> Silhouette: 0.5352064557197957, Stability 0.9998641967988833 400th iteration min_samples: 400, eps: 0.75, metric: euclidean ==> Silhouette: 0.5349506573634727, Stability 0.9999003426432691 min_samples: 500, eps: 0.75, metric: euclidean ==> Silhouette: 0.5332429213729569, Stability 0.999663125202294 min_samples: 700, eps: 0.75, metric: euclidean ==> Silhouette: 0.5278709652487895, Stability 0.9980164376840983 410th iteration min_samples: 10, eps: 0.75, metric: manhattan ==> Silhouette: 0.5249449267988474, Stability 0.9980093219613799 min_samples: 50, eps: 0.75, metric: manhattan ==> Silhouette: 0.5180263187574876, Stability 0.9972332394748366 min_samples: 100, eps: 0.75, metric: manhattan ==> Silhouette: 0.5084650618046279, Stability 0.9965679949459118 420th iteration min_samples: 200, eps: 0.75, metric: manhattan ==> Silhouette: 
0.4549190116366683, Stability 0.9933610776268288 min_samples: 300, eps: 0.75, metric: manhattan ==> Silhouette: 0.368336051610334, Stability 0.9752084608913221 min_samples: 400, eps: 0.75, metric: manhattan ==> Silhouette: 0.33753701041058254, Stability 0.9876837454479452 min_samples: 500, eps: 0.75, metric: manhattan ==> Silhouette: 0.25680523691335694, Stability 0.9561564120838209 430th iteration min_samples: 700, eps: 0.75, metric: manhattan ==> Silhouette: 0.227537223274369, Stability 0.9894573256698672 min_samples: 10, eps: 0.85, metric: euclidean ==> Silhouette: 0.5354934218660132, Stability 1.0 min_samples: 50, eps: 0.85, metric: euclidean ==> Silhouette: 0.5354934218660132, Stability 1.0 440th iteration min_samples: 100, eps: 0.85, metric: euclidean ==> Silhouette: 0.5354934218660132, Stability 1.0 min_samples: 200, eps: 0.85, metric: euclidean ==> Silhouette: 0.5354934218660132, Stability 1.0 min_samples: 300, eps: 0.85, metric: euclidean ==> Silhouette: 0.5354934218660132, Stability 1.0 450th iteration min_samples: 400, eps: 0.85, metric: euclidean ==> Silhouette: 0.5353094895560568, Stability 0.9999359935437886 min_samples: 500, eps: 0.85, metric: euclidean ==> Silhouette: 0.5350738727647902, Stability 0.9999714015864972 min_samples: 700, eps: 0.85, metric: euclidean ==> Silhouette: 0.5328756993390827, Stability 0.9998424895324131 min_samples: 10, eps: 0.85, metric: manhattan ==> Silhouette: 0.5287676132770411, Stability 0.9992124123349583 460th iteration min_samples: 50, eps: 0.85, metric: manhattan ==> Silhouette: 0.5248655096055622, Stability 0.9987467476420152 min_samples: 100, eps: 0.85, metric: manhattan ==> Silhouette: 0.520228601994507, Stability 0.9977703965132173 min_samples: 200, eps: 0.85, metric: manhattan ==> Silhouette: 0.49510453127563686, Stability 0.9973006656701112 470th iteration min_samples: 300, eps: 0.85, metric: manhattan ==> Silhouette: 0.45683410872092645, Stability 0.9921624085211354 min_samples: 400, eps: 0.85, metric: 
manhattan ==> Silhouette: 0.38760378482480834, Stability 0.9909384518208244 min_samples: 500, eps: 0.85, metric: manhattan ==> Silhouette: 0.35849915486049183, Stability 0.9896293717689573 480th iteration min_samples: 700, eps: 0.85, metric: manhattan ==> Silhouette: 0.2591775692812997, Stability 0.9372652744727988 min_samples: 10, eps: 0.95, metric: euclidean ==> Silhouette: 0.5354923621896042, Stability 0.9999658055924486 min_samples: 50, eps: 0.95, metric: euclidean ==> Silhouette: 0.5354923621896042, Stability 0.9999658055924486 min_samples: 100, eps: 0.95, metric: euclidean ==> Silhouette: 0.5354923621896042, Stability 0.9999658055924486 490th iteration min_samples: 200, eps: 0.95, metric: euclidean ==> Silhouette: 0.5354970260462174, Stability 0.999979827350568 min_samples: 300, eps: 0.95, metric: euclidean ==> Silhouette: 0.535514680020167, Stability 1.0 min_samples: 400, eps: 0.95, metric: euclidean ==> Silhouette: 0.535514680020167, Stability 1.0 500th iteration min_samples: 500, eps: 0.95, metric: euclidean ==> Silhouette: 0.5354158550032643, Stability 1.0 min_samples: 700, eps: 0.95, metric: euclidean ==> Silhouette: 0.5348664437328728, Stability 0.9999508088061108 min_samples: 10, eps: 0.95, metric: manhattan ==> Silhouette: 0.5319269663806926, Stability 0.9992169838852352 510th iteration min_samples: 50, eps: 0.95, metric: manhattan ==> Silhouette: 0.5302351986804559, Stability 0.9989099108721852 min_samples: 100, eps: 0.95, metric: manhattan ==> Silhouette: 0.5270564999542784, Stability 0.9987657616044502 min_samples: 200, eps: 0.95, metric: manhattan ==> Silhouette: 0.5178478891742203, Stability 0.9984069584791099 min_samples: 300, eps: 0.95, metric: manhattan ==> Silhouette: 0.4912609610121122, Stability 0.9964193919611996 520th iteration min_samples: 400, eps: 0.95, metric: manhattan ==> Silhouette: 0.4575232651374328, Stability 0.981771826545186 min_samples: 500, eps: 0.95, metric: manhattan ==> Silhouette: 0.40276277985042164, Stability 
0.9899110785207247 min_samples: 700, eps: 0.95, metric: manhattan ==> Silhouette: 0.3356344082032731, Stability 0.9210848341182987
# Rank the hyperparameter combinations by silhouette score, best first.
scores_dbs = scores_dbs.sort_values(by="Silhouette", ascending=False)
# Persist the gridsearch results so they can be reloaded without re-running.
with open('data/dbscan_scores_v3.pkl','wb') as file:
    dill.dump(scores_dbs, file)
scores_dbs.head(10)
| min_samples | min_samples_pct | eps | metric | Silhouette | Stability | |
|---|---|---|---|---|---|---|
| 165 | 400 | 0.029516 | 0.95 | euclidean | 0.535515 | 1.000000 |
| 164 | 300 | 0.022137 | 0.95 | euclidean | 0.535515 | 1.000000 |
| 163 | 200 | 0.014758 | 0.95 | euclidean | 0.535497 | 0.999980 |
| 144 | 10 | 0.000738 | 0.85 | euclidean | 0.535493 | 1.000000 |
| 145 | 50 | 0.003689 | 0.85 | euclidean | 0.535493 | 1.000000 |
| 146 | 100 | 0.007379 | 0.85 | euclidean | 0.535493 | 1.000000 |
| 147 | 200 | 0.014758 | 0.85 | euclidean | 0.535493 | 1.000000 |
| 148 | 300 | 0.022137 | 0.85 | euclidean | 0.535493 | 1.000000 |
| 161 | 50 | 0.003689 | 0.95 | euclidean | 0.535492 | 0.999966 |
| 162 | 100 | 0.007379 | 0.95 | euclidean | 0.535492 | 0.999966 |
# Reload the persisted gridsearch results from disk.
with open('data/dbscan_scores_v3.pkl', 'rb') as file:
    scores_dbs = dill.load(file)
scores_dbs.head(5)
| min_samples | min_samples_pct | eps | metric | Silhouette | Stability | |
|---|---|---|---|---|---|---|
| 165 | 400 | 0.029516 | 0.95 | euclidean | 0.535515 | 1.00000 |
| 164 | 300 | 0.022137 | 0.95 | euclidean | 0.535515 | 1.00000 |
| 163 | 200 | 0.014758 | 0.95 | euclidean | 0.535497 | 0.99998 |
| 144 | 10 | 0.000738 | 0.85 | euclidean | 0.535493 | 1.00000 |
| 145 | 50 | 0.003689 | 0.85 | euclidean | 0.535493 | 1.00000 |
#Defining our final DBSCAN model with optimized hyperparameters
X_tr = preprocess_pipe.fit_transform(X)
# Take the top-ranked row of the silhouette-sorted gridsearch results.
best_results = scores_dbs.head(1)
# min_samples was stored as a fraction of the fold size during the search;
# rescale it to the size of the full transformed dataset.
best_min_samples = int(best_results.min_samples_pct.values[0] * len(X_tr))
dbs_best = DBSCAN(n_jobs=-1,
                  min_samples=best_min_samples,
                  eps=best_results.eps.values[0],
                  metric=best_results.metric.values[0]).fit(X_tr)
#Visualization with PCA
# Reduce the preprocessed data to 7 principal components for plotting.
n_components = 7
pca = PCA(n_components=n_components, n_oversamples=10)
pca.fit(X_tr)
#Transforming our initial dataset
X_red = pca.transform(X_tr)
clusters = dbs_best.labels_
# Draw every consecutive pair of components, coloured by DBSCAN cluster.
for axis_idx in range(1, n_components):
    plane = [(axis_idx - 1, axis_idx)]
    display_factorial_planes(X_red, n_components, pca, plane,
                             illustrative_var=clusters, alpha=0.8)
    plt.tight_layout()
    plt.show()
silhouette_score(X_tr, clusters)
0.5359952083171928
# Scores observed above, hard-coded and reshaped to long format for seaborn.
model_scores = [
    {"Model" : "DBSCAN", "Silhouette": 0.5359952083171928, "Stability": 1.000},
    {"Model" : "Kmeans", "Silhouette": 0.5356087850017238, "Stability": 1},
]
results = pd.melt(pd.DataFrame(model_scores), id_vars='Model')
sns.set(rc={'figure.figsize':(20,15)})
# One bar per (metric, model) pair.
sns.catplot(x='variable', y='value', hue='Model', data=results, kind='bar', height = 8)
plt.title("Stability and Silhouette score for DBSCAN and Kmeans clustering")
plt.xlabel("Scoring metric")
plt.ylabel("Value")
plt.show()
Let's compare both our models on the complete dataset:
# Score both models on the complete transformed dataset.
X_tr = preprocess_pipe.transform(X)
# NOTE(review): dbs_best.labels_ come from the earlier fit on
# preprocess_pipe.fit_transform(X); reusing them here is only valid if this
# transform reproduces exactly the data dbs_best was fitted on — confirm.
clusters = dbs_best.labels_
silhouette = silhouette_score(X_tr, clusters)
stability = cluster_stability(X_tr, dbs_best)
print("Silhouette score for DBSCAN on dataset: {:.4f}, Stability index : {:.3f}".format(silhouette,stability))
# Fit a fresh KMeans with the K chosen earlier (K=7) for comparison.
km = KMeans(n_clusters=7, n_init=10)
km_clusters= km.fit_predict(X_tr)
silhouette = silhouette_score(X_tr, km_clusters)
stability = cluster_stability(X_tr, km)
print("Silhouette score for Kmeans on dataset: {:.4f}, Stability index : {:.3f}".format(silhouette,stability))
#Studying the repartition of clusters
X_tr['cluster'] = clusters
cluster_counts = X_tr.cluster.value_counts()
# Pie chart: share of customers per DBSCAN cluster.
fig = go.Figure(data=[go.Pie(values=cluster_counts, labels=cluster_counts.index)])
fig.update_layout(
    title={'text': "Pie chart of the repartition of clusters"},
    title_x = 0.5,
    font=dict(size=18, color="Black"),
)
fig.show()
# Attach the labels of both clustering models to the customer table.
customers_df['cluster'] = clusters
km = KMeans(n_clusters=7, n_init=10)
km_clusters = km.fit_predict(X_tr)
customers_df['km_cluster'] = km_clusters
# Bucket customers into 5 ordered segments from their RFM score.
# Each row is assigned at most once: a later rule only fires on rows still
# holding the initial 0 placeholder.
customers_df["customer_segment_num"] = 0
segment_rules = (
    (5, customers_df['RFM_score'] > 4.5),
    (4, customers_df['RFM_score'] > 4),
    (3, customers_df['RFM_score'] > 3),
    (2, customers_df['RFM_score'] > 1.6),
    (1, customers_df['RFM_score'] <= 1.6),
)
for segment, condition in segment_rules:
    unassigned = customers_df.customer_segment_num == 0
    customers_df.loc[unassigned & condition, 'customer_segment_num'] = segment
customers_df
| GENDER | AGE | GEOGRAPHY | number_orders | order_value_min | order_value_max | total_revenue | order_value_mean | order_value_std | order_value_median | ... | age_bin | recency | R_rank_norm | F_rank_norm | M_rank_norm | RFM_score | customer_segment | cluster | km_cluster | customer_segment_num | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| Customer_ID | |||||||||||||||||||||
| 1 | M | 22 | Italy | 1 | 16.29 | 16.29 | 16.29 | 16.290 | 0.000000 | 16.290 | ... | 18-24 | 343 | 4.164448 | 25.509165 | 39.064022 | 1.5015 | Lost Customer | 0 | 1 | 1 |
| 2 | M | 24 | Italy | 2 | 7.77 | 15.00 | 22.77 | 11.385 | 5.112382 | 11.385 | ... | 18-24 | 195 | 42.605165 | 60.287346 | 48.926326 | 2.5580 | Low Value Customer | 0 | 1 | 2 |
| 3 | M | 23 | Italy | 1 | 10.92 | 10.92 | 10.92 | 10.920 | 0.000000 | 10.920 | ... | 18-24 | 363 | 0.102059 | 25.509165 | 26.963606 | 1.1265 | Lost Customer | 0 | 1 | 1 |
| 4 | M | 27 | Germany | 2 | 6.69 | 26.60 | 33.29 | 16.645 | 14.078496 | 16.645 | ... | 25-29 | 52 | 73.544551 | 60.287346 | 59.512530 | 3.0915 | Medium Value Customer | 1 | 2 | 3 |
| 5 | M | 36 | Germany | 5 | 7.06 | 20.13 | 78.82 | 15.764 | 5.429473 | 17.170 | ... | 35-39 | 179 | 45.280884 | 86.305676 | 79.465155 | 3.8125 | Medium Value Customer | 1 | 2 | 3 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 22621 | F | 50 | Spain | 1 | 9.69 | 9.69 | 9.69 | 9.690 | 0.000000 | 9.690 | ... | 50-54 | 21 | 86.477192 | 25.509165 | 23.875410 | 1.6860 | Low Value Customer | 5 | 6 | 2 |
| 22622 | F | 40 | France | 1 | 6.07 | 6.07 | 6.07 | 6.070 | 0.000000 | 6.070 | ... | 40-44 | 15 | 90.275559 | 25.509165 | 12.268662 | 1.3840 | Lost Customer | 6 | 0 | 1 |
| 22623 | F | 41 | France | 1 | 128.01 | 128.01 | 128.01 | 128.010 | 0.000000 | 128.010 | ... | 40-44 | 10 | 93.914182 | 25.509165 | 87.186753 | 3.5465 | Medium Value Customer | 6 | 0 | 3 |
| 22624 | M | 26 | Italy | 1 | 19.60 | 19.60 | 19.60 | 19.600 | 0.000000 | 19.600 | ... | 25-29 | 29 | 82.330493 | 25.509165 | 44.284070 | 2.2365 | Low Value Customer | 0 | 1 | 2 |
| 22625 | M | 36 | Germany | 4 | 1.11 | 59.50 | 83.62 | 20.905 | 26.274122 | 11.505 | ... | 35-39 | 0 | 100.000000 | 81.643496 | 80.596387 | 4.1900 | High Value Customers | 1 | 2 | 4 |
22586 rows × 34 columns
# One-hot encode the categorical columns, then keep only numeric fields so
# the profiling plots below can aggregate every column.
customers_num = pd.get_dummies(customers_df, columns=['GENDER','GEOGRAPHY'])
customers_num = customers_num.select_dtypes(include=np.number)
def display_parallel_coordinates_plot(X_tr: pd.DataFrame, cluster_col: str):
    """Plot per-cluster feature means as parallel-coordinates charts, 9 features at a time.

    Parameters
    ----------
    X_tr : pd.DataFrame
        Numeric customer features plus a ``cluster_col`` column; rows labelled -1
        (unclustered points, e.g. DBSCAN noise) are dropped before aggregating.
    cluster_col : str
        Name of the column holding the cluster labels.
    """
    #Removing unclustered datapoints
    df_mean = X_tr.drop(X_tr[X_tr[cluster_col] == -1].index)
    #Aggregating by cluster and calculating the means of each fields
    df_mean = df_mean.groupby(cluster_col).mean(numeric_only=True)
    #Rescaling data for better visualization
    df_mean_scaled = StandardScaler().fit_transform(df_mean)
    #Transforming our data back into a dataframe; reset_index turns the former
    #cluster-id index into a regular "index" column, renamed to "cluster" below
    df_mean = pd.DataFrame(df_mean_scaled, columns=df_mean.columns.to_list()).reset_index()
    if 'cluster' in list(df_mean.columns):
        df_mean.drop(columns='cluster', inplace=True)
    df_mean.rename(columns={"index": "cluster"}, inplace=True)
    #Adding 1 to the number of cluster just to have numbers from 1 to 7
    if min(df_mean["cluster"]) == 0:
        df_mean["cluster"] = df_mean["cluster"] + 1
    clusters = df_mean["cluster"]
    for k in range(1,len(X_tr.columns)//9+1):
        i = k*9
        #BUG FIX: the original compared i against len(X_tr) (the ROW count,
        #tens of thousands) instead of the column count, so this branch could
        #never fire. NOTE(review): even fixed, i never exceeds the column
        #count with this loop bound, so trailing columns beyond the last full
        #group of 9 are not plotted — confirm whether that is intended.
        if i > len(X_tr.columns):
            #.copy() so the assignment below does not hit a chained-assignment warning
            subset = df_mean.iloc[:, -9:].copy()
            subset["cluster"] = clusters
        elif k==1:
            #First chunk already contains the "cluster" column (position 0 after rename)
            subset = df_mean.iloc[:,(i-9):i]
        else:
            subset = df_mean.iloc[:,(i-9):i].copy()
            subset["cluster"] = clusters
        display_parallel_coordinates_centroids(subset, 6)
        plt.show()
#Parallel-coordinates view of the mean feature profile per KMeans cluster
display_parallel_coordinates_plot(customers_num, 'cluster')
#Gender breakdown of the customers assigned to cluster 1
customers_df.loc[customers_df['cluster'] == 1, 'GENDER'].value_counts()
M 6806 F 1151 Name: GENDER, dtype: int64
7 customer segments have been identified:
#Parallel-coordinates view of the mean feature profile per customer segment
display_parallel_coordinates_plot(customers_num, 'customer_segment_num')
5 segments have been identified:
2 different marketing campaigns can be put in place to target the highest-revenue customers:
#Geographic breakdown of customers in segment 4
customers_df.loc[customers_df['customer_segment_num'] == 4, 'GEOGRAPHY'].value_counts()
Germany 694 Italy 399 France 226 UK 181 Greece 172 Spain 155 Netherlands 97 Name: GEOGRAPHY, dtype: int64
#Persisting the final datasets for downstream use
for frame, path in ((customers_df, 'data/customers_df_final.csv'),
                    (transactions, 'data/transactions_final.csv')):
    frame.to_csv(path)
#Total number of distinct customers
num_customers = orders["Customer_ID"].unique().size
#Distinct transaction count per customer
churn_df = orders.groupby("Customer_ID")[["Transaction_ID"]].nunique()
#Customers with only one transaction are considered churned
churn_rate = 1 - (churn_df["Transaction_ID"] > 1).sum() / num_customers
churn_rate
0.5101390241742673
#Average purchase frequency: distinct transactions per customer
apf = orders["Transaction_ID"].unique().size / num_customers
apf
2.856017001682458
#Average purchase value: total sales per customer
apv = orders["Sales_Amount"].sum() / num_customers
apv
69.59138094394757
#Customer lifetime value: average value times frequency, scaled by churn
clv = (apv * apf) / churn_rate
clv
389.60784752389185